Questions of Interest:
1. How have COVID-19 cases and deaths evolved over time in different
regions of the United States?
2. Are there patterns in population-adjusted case and death rates that
reveal disparities between states?
Data was obtained from GitHub and was provided by Johns Hopkins GitHub. Data source can be found using the following URL: https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
Description:
The dataset includes time-series data on COVID-19 confirmed cases and
deaths for both the United States and global regions. Columns represent
dates, and rows represent different regions or administrative
divisions.
Importing Libraries
# Importing the necessary libraries for analysis
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
Data Loading
# Defining the URL as show during week 3 lecture
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
# Defining names of each CSV file
file_names <- c(
"time_series_covid19_confirmed_global.csv",
"time_series_covid19_deaths_global.csv",
"time_series_covid19_confirmed_US.csv",
"time_series_covid19_deaths_US.csv"
)
# Concatenating the URL and files together
urls <- str_c(url_in, file_names, sep = "")
#Reading the CSV files
global_cases <- read_csv(urls[1], show_col_types = FALSE)
global_deaths <- read_csv(urls[2], show_col_types = FALSE)
US_cases <- read_csv(urls[3], show_col_types = FALSE)
US_deaths <- read_csv(urls[4], show_col_types = FALSE)
# Preview the first few rows of global_cases
head(global_cases)
Province/State <chr> | Country/Region <chr> | Lat <dbl> | Long <dbl> | 1/22/20 <dbl> | 1/23/20 <dbl> | 1/24/20 <dbl> | 1/25/20 <dbl> | 1/26/20 <dbl> | 1/27/20 <dbl> | |
|---|---|---|---|---|---|---|---|---|---|---|
| NA | Afghanistan | 33.93911 | 67.70995 | 0 | 0 | 0 | 0 | 0 | 0 | |
| NA | Albania | 41.15330 | 20.16830 | 0 | 0 | 0 | 0 | 0 | 0 | |
| NA | Algeria | 28.03390 | 1.65960 | 0 | 0 | 0 | 0 | 0 | 0 | |
| NA | Andorra | 42.50630 | 1.52180 | 0 | 0 | 0 | 0 | 0 | 0 | |
| NA | Angola | -11.20270 | 17.87390 | 0 | 0 | 0 | 0 | 0 | 0 | |
| NA | Antarctica | -71.94990 | 23.34700 | 0 | 0 | 0 | 0 | 0 | 0 |
Data Tidying & Transformation
# Pivoting columns
global_cases <- global_cases %>%
pivot_longer(
cols = -c(`Province/State`, `Country/Region`, Lat, Long),
names_to = "date",
values_to = "cases"
) %>%
select(-c(Lat, Long))
global_deaths <- global_deaths %>%
pivot_longer(
cols = -c(`Province/State`, `Country/Region`, Lat, Long),
names_to = "date",
values_to = "deaths"
) %>%
select(-c(Lat, Long))
# Viewing data to ensure transformation was effective
head(global_cases)
Province/State <chr> | Country/Region <chr> | date <chr> | cases <dbl> | |
|---|---|---|---|---|
| NA | Afghanistan | 1/22/20 | 0 | |
| NA | Afghanistan | 1/23/20 | 0 | |
| NA | Afghanistan | 1/24/20 | 0 | |
| NA | Afghanistan | 1/25/20 | 0 | |
| NA | Afghanistan | 1/26/20 | 0 | |
| NA | Afghanistan | 1/27/20 | 0 |
head(global_deaths)
Province/State <chr> | Country/Region <chr> | date <chr> | deaths <dbl> | |
|---|---|---|---|---|
| NA | Afghanistan | 1/22/20 | 0 | |
| NA | Afghanistan | 1/23/20 | 0 | |
| NA | Afghanistan | 1/24/20 | 0 | |
| NA | Afghanistan | 1/25/20 | 0 | |
| NA | Afghanistan | 1/26/20 | 0 | |
| NA | Afghanistan | 1/27/20 | 0 |
# Combining global_cases and global_deaths into one global variable
global <- global_cases %>%
full_join(global_deaths) %>%
rename(
Country_Region = `Country/Region`,
Province_State = `Province/State`
) %>%
mutate(date = mdy(date))
## Joining with `by = join_by(`Province/State`, `Country/Region`, date)`
# Viewing data to ensure accuracty
head(global)
Province_State <chr> | Country_Region <chr> | date <date> | cases <dbl> | deaths <dbl> |
|---|---|---|---|---|
| NA | Afghanistan | 2020-01-22 | 0 | 0 |
| NA | Afghanistan | 2020-01-23 | 0 | 0 |
| NA | Afghanistan | 2020-01-24 | 0 | 0 |
| NA | Afghanistan | 2020-01-25 | 0 | 0 |
| NA | Afghanistan | 2020-01-26 | 0 | 0 |
| NA | Afghanistan | 2020-01-27 | 0 | 0 |
global <- global %>% filter(cases > 0)
summary(global)
## Province_State Country_Region date cases
## Length:306827 Length:306827 Min. :2020-01-22 Min. : 1
## Class :character Class :character 1st Qu.:2020-12-12 1st Qu.: 1316
## Mode :character Mode :character Median :2021-09-16 Median : 20365
## Mean :2021-09-11 Mean : 1032863
## 3rd Qu.:2022-06-15 3rd Qu.: 271281
## Max. :2023-03-09 Max. :103802702
## deaths
## Min. : 0
## 1st Qu.: 7
## Median : 214
## Mean : 14405
## 3rd Qu.: 3665
## Max. :1123836
# Looking further into US cases
head(US_cases)
UID <dbl> | iso2 <chr> | iso3 <chr> | code3 <dbl> | FIPS <dbl> | Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Lat <dbl> | Long_ <dbl> | |
|---|---|---|---|---|---|---|---|---|---|---|
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001003 | US | USA | 840 | 1003 | Baldwin | Alabama | US | 30.72775 | -87.72207 | |
| 84001005 | US | USA | 840 | 1005 | Barbour | Alabama | US | 31.86826 | -85.38713 | |
| 84001007 | US | USA | 840 | 1007 | Bibb | Alabama | US | 32.99642 | -87.12511 | |
| 84001009 | US | USA | 840 | 1009 | Blount | Alabama | US | 33.98211 | -86.56791 | |
| 84001011 | US | USA | 840 | 1011 | Bullock | Alabama | US | 32.10031 | -85.71266 |
# Pivoting the dataset from wide to long format
US_cases %>%
pivot_longer(
cols = -(UID:Combined_Key),
names_to = "date",
values_to = "cases"
)
UID <dbl> | iso2 <chr> | iso3 <chr> | code3 <dbl> | FIPS <dbl> | Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Lat <dbl> | Long_ <dbl> | |
|---|---|---|---|---|---|---|---|---|---|---|
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 |
head(US_cases)
UID <dbl> | iso2 <chr> | iso3 <chr> | code3 <dbl> | FIPS <dbl> | Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Lat <dbl> | Long_ <dbl> | |
|---|---|---|---|---|---|---|---|---|---|---|
| 84001001 | US | USA | 840 | 1001 | Autauga | Alabama | US | 32.53953 | -86.64408 | |
| 84001003 | US | USA | 840 | 1003 | Baldwin | Alabama | US | 30.72775 | -87.72207 | |
| 84001005 | US | USA | 840 | 1005 | Barbour | Alabama | US | 31.86826 | -85.38713 | |
| 84001007 | US | USA | 840 | 1007 | Bibb | Alabama | US | 32.99642 | -87.12511 | |
| 84001009 | US | USA | 840 | 1009 | Blount | Alabama | US | 33.98211 | -86.56791 | |
| 84001011 | US | USA | 840 | 1011 | Bullock | Alabama | US | 32.10031 | -85.71266 |
US_cases <- US_cases %>%
pivot_longer(
cols = -(UID:Combined_Key),
names_to = "date",
values_to = "cases"
) %>%
select(Admin2:cases) %>%
mutate(date = mdy(date)) %>%
select(-c(Lat, Long_))
head(US_cases)
Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Combined_Key <chr> | date <date> | cases <dbl> |
|---|---|---|---|---|---|
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-22 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-23 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-24 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-25 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-26 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-27 | 0 |
US_deaths <- US_deaths %>%
pivot_longer(
cols = starts_with("1/"), # or "2/" if the dates start with February (adjust as needed)
names_to = "date",
values_to = "deaths"
) %>%
mutate(date = mdy(date)) %>% # Ensure `date` is converted to a Date object
select(Admin2, Province_State, Country_Region, Combined_Key, date, deaths)
# View the transformed dataset
head(US_deaths)
Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Combined_Key <chr> | date <date> | deaths <dbl> |
|---|---|---|---|---|---|
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-22 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-23 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-24 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-25 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-26 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-27 | 0 |
# Combining datasets
US <- US_cases %>% full_join(US_deaths)
## Joining with `by = join_by(Admin2, Province_State, Country_Region,
## Combined_Key, date)`
head(US)
Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Combined_Key <chr> | date <date> | cases <dbl> | deaths <dbl> |
|---|---|---|---|---|---|---|
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-22 | 0 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-23 | 0 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-24 | 0 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-25 | 0 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-26 | 0 | 0 |
| Autauga | Alabama | US | Autauga, Alabama, US | 2020-01-27 | 0 | 0 |
global <- global %>%
unite(
"Combined_Key",
c(Province_State, Country_Region),
sep = ", ",
na.rm = TRUE,
remove = FALSE
)
global
Combined_Key <chr> | Province_State <chr> | Country_Region <chr> | date <date> | cases <dbl> | deaths <dbl> |
|---|---|---|---|---|---|
| Afghanistan | NA | Afghanistan | 2020-02-24 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-02-25 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-02-26 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-02-27 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-02-28 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-02-29 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-03-01 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-03-02 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-03-03 | 5 | 0 |
| Afghanistan | NA | Afghanistan | 2020-03-04 | 5 | 0 |
uid_lookup_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
uid <- read_csv(uid_lookup_url) %>%
select(-c(Lat, Long_, Combined_Key, code3, iso2, iso3, Admin2))
## Rows: 4321 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): iso2, iso3, FIPS, Admin2, Province_State, Country_Region, Combined_Key
## dbl (5): UID, code3, Lat, Long_, Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
global <- global %>%
left_join(uid, by = c("Province_State", "Country_Region")) %>%
select(-c(UID, FIPS)) %>%
select(Province_State, Country_Region, date, cases, deaths, Population, Combined_Key)
global
Province_State <chr> | Country_Region <chr> | date <date> | cases <dbl> | deaths <dbl> | Population <dbl> | Combined_Key <chr> |
|---|---|---|---|---|---|---|
| NA | Afghanistan | 2020-02-24 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-02-25 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-02-26 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-02-27 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-02-28 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-02-29 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-03-01 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-03-02 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-03-03 | 5 | 0 | 38928341 | Afghanistan |
| NA | Afghanistan | 2020-03-04 | 5 | 0 | 38928341 | Afghanistan |
uid_lookup_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
uid <- read_csv(uid_lookup_url)
## Rows: 4321 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): iso2, iso3, FIPS, Admin2, Province_State, Country_Region, Combined_Key
## dbl (5): UID, code3, Lat, Long_, Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(uid)
UID <dbl> | iso2 <chr> | iso3 <chr> | code3 <dbl> | FIPS <chr> | Admin2 <chr> | Province_State <chr> | Country_Region <chr> | Lat <dbl> | Long_ <dbl> | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4 | AF | AFG | 4 | NA | NA | NA | Afghanistan | 33.93911 | 67.70995 | |
| 8 | AL | ALB | 8 | NA | NA | NA | Albania | 41.15330 | 20.16830 | |
| 10 | AQ | ATA | 10 | NA | NA | NA | Antarctica | -71.94990 | 23.34700 | |
| 12 | DZ | DZA | 12 | NA | NA | NA | Algeria | 28.03390 | 1.65960 | |
| 20 | AD | AND | 20 | NA | NA | NA | Andorra | 42.50630 | 1.52180 | |
| 24 | AO | AGO | 24 | NA | NA | NA | Angola | -11.20270 | 17.87390 |
US <- US %>%
left_join(uid, by = c("Province_State" = "Province_State",
"Country_Region" = "Country_Region",
"Admin2" = "Admin2")) # adjust if needed
state_data <- US %>%
group_by(Province_State) %>%
summarize(
total_cases = sum(cases, na.rm = TRUE),
total_deaths = sum(deaths, na.rm = TRUE),
population = first(Population) # now exists
) %>%
mutate(death_rate = total_deaths / population * 100000) %>%
arrange(desc(death_rate)) %>%
slice_head(n = 5)
Data Analysis & Viz
# Summarizing total cases over time
cases_over_time <- US %>%
group_by(date) %>%
summarize(total_cases = sum(cases, na.rm = TRUE))
ggplot(cases_over_time, aes(x = date, y = total_cases)) +
geom_line(color = "blue", linewidth = 1) + # Updated aesthetic
labs(
title = "Total COVID-19 Cases Over Time in the US",
x = "Date",
y = "Total Cases"
) +
theme_minimal()
# Summarizing state-level data
ggplot(state_data, aes(x = reorder(Province_State, death_rate), y = death_rate)) +
geom_bar(stat = "identity") +
labs(
title = "COVID-19 Death Rates per 100,000 by State",
x = "State",
y = "Death Rate per 100,000"
) +
theme_minimal() +
coord_flip()
# 1. linear model: Deaths ~ Cases
model_simple1 <- lm(deaths ~ cases, data = US)
# 2. Base R summary
summary_simple1 <- summary(model_simple1)
print(summary_simple1)
##
## Call:
## lm(formula = deaths ~ cases, data = US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5580.8 -28.0 -20.6 5.7 25655.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.801e+01 5.255e-01 53.3 <2e-16 ***
## cases 1.068e-02 7.121e-06 1499.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 299.8 on 344224 degrees of freedom
## (3475680 observations deleted due to missingness)
## Multiple R-squared: 0.8673, Adjusted R-squared: 0.8673
## F-statistic: 2.25e+06 on 1 and 344224 DF, p-value: < 2.2e-16
simple1_coefs <- as.data.frame(summary_simple1$coefficients) %>%
rownames_to_column("term") %>%
rename(
estimate = "Estimate",
std.error = "Std. Error",
statistic = "t value",
p.value = "Pr(>|t|)"
)
cat("\nModel 1 Coefficients:\n")
##
## Model 1 Coefficients:
print(simple1_coefs)
## term estimate std.error statistic p.value
## 1 (Intercept) 28.00920121 5.255336e-01 53.29669 0
## 2 cases 0.01068099 7.121328e-06 1499.85913 0
ggplot(US, aes(x = cases, y = deaths)) +
geom_point(alpha = 0.3, color = "blue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(
title = "Model 1: Deaths vs. Cases",
x = "Cases",
y = "Deaths"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3475680 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3475680 rows containing missing values or values outside the scale
## range (`geom_point()`).
# 1. Simple linear model: Deaths ~ Population
model_simple2 <- lm(deaths ~ Population, data = US)
# 2. Base R summary
summary_simple2 <- summary(model_simple2)
print(summary_simple2)
##
## Call:
## lm(formula = deaths ~ Population, data = US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21037.9 -28.5 3.4 41.4 14243.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.998e-01 7.995e-01 -1.125 0.26
## Population 2.096e-03 2.319e-06 903.762 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 440.2 on 332997 degrees of freedom
## (3486907 observations deleted due to missingness)
## Multiple R-squared: 0.7104, Adjusted R-squared: 0.7104
## F-statistic: 8.168e+05 on 1 and 332997 DF, p-value: < 2.2e-16
# 3. Tidy-like data frame
simple2_coefs <- as.data.frame(summary_simple2$coefficients) %>%
rownames_to_column("term") %>%
rename(
estimate = "Estimate",
std.error = "Std. Error",
statistic = "t value",
p.value = "Pr(>|t|)"
)
cat("\nModel 2 Coefficients:\n")
##
## Model 2 Coefficients:
print(simple2_coefs)
## term estimate std.error statistic p.value
## 1 (Intercept) -0.899778660 7.994692e-01 -1.12547 0.2603907
## 2 Population 0.002095681 2.318842e-06 903.76187 0.0000000
# 4. (Optional) Quick Plot
ggplot(US, aes(x = Population, y = deaths)) +
geom_point(alpha = 0.3, color = "blue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(
title = "Model 2: Deaths vs. Population",
x = "Population",
y = "Deaths"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3486907 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3486907 rows containing missing values or values outside the scale
## range (`geom_point()`).
The analysis provides a broad overview showing that both cases and population are positively associated with deaths. Future work could involve more sophisticated models that incorporate time lags, demographic variables, or regional factors to gain a deeper understanding of how COVID-19 impacted different areas.